In [1]:
import json
DATA_PATH = 'cleaned_data.json'
all_data = json.load(open(DATA_PATH))
In [2]:
import pandas as pd
import numpy as np
answers = pd.DataFrame(all_data)
answers['duration'] = answers.submit_time - answers.srv_time
answers['user_id'] = answers.user_id.astype(np.int64)  # np.int was removed from NumPy
# Clamp response time to 2 minutes.
answers.loc[answers.duration > 120, 'duration'] = 120
# Keep only users who answered more than 10 questions.
num_answered = answers.user_id.value_counts()
completions = num_answered[num_answered > 10].index
answers = answers[answers.user_id.isin(completions)]
# Some of the survey experiments did not get many responses; filter them out.
source_counts = answers['type'].value_counts()
In [3]:
# So like 3 people completed surveys from Twitter :)...
source_counts
Out[3]:
In [4]:
answers[0:3]
Out[4]:
In [5]:
answers = answers[answers.type.isin(source_counts[source_counts > 100].index)]
In [6]:
grouped = answers.groupby(['type', 'question_id'])['duration'].agg(['mean', 'count', 'std'])
grouped
Out[6]:
In [7]:
questions = json.load(open('../app/survey.json'))
text_by_id = {q['id']: q['question'] for q in questions['questions']}
# Mark answers that were left at the question's pre-filled prompt as 'DEFAULT'
# so they are easy to filter later.
for q in questions['questions']:
    if 'prompt' not in q:
        continue
    answers.loc[(answers.question_id == q['id']) &
                (answers.answer == q['prompt']), 'answer'] = 'DEFAULT'
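A quick way to sanity-check that loop is to count, per question, the answers now marked DEFAULT:

answers[answers.answer == 'DEFAULT'].question_id.value_counts()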
In [8]:
print(answers['type'].unique())
answers['type'].value_counts()
Out[8]:
I'm just going to work on finding the timing differences between the 5-cent Turk answers and the "Do it fast" Turk answers.
In [9]:
%matplotlib inline
import matplotlib.pyplot as plt

def plot_single_question(question_id, survey_answers, answer_types,
                         normalize=False, ax=None):
    if ax is None:
        ax = plt.gca()
    durations = [survey_answers[(survey_answers.type == t) &
                                (survey_answers.question_id == question_id)].duration
                 for t in answer_types]
    if normalize:
        # Weight each sample by 1/n so every group's histogram sums to 1.
        weights = [np.ones_like(d) / len(d) for d in durations]
    else:
        weights = None
    ax.hist(durations, label=list(answer_types), weights=weights)
    ax.legend(bbox_to_anchor=(1.7, .95))
    ax.set_title(text_by_id[question_id][:80] + ' (question {})'.format(question_id))

def plot_all_questions(survey_answers, answer_types, question_ids, normalize=False):
    size = 5
    plt.figure(figsize=(size, len(question_ids) * size))
    for (i, q) in enumerate(question_ids):
        ax = plt.subplot(len(question_ids), 1, i + 1)
        plot_single_question(q, survey_answers, answer_types, normalize=normalize, ax=ax)
In [10]:
plot_all_questions(answers[answers.duration < 50],
                   ['Turk, asking for Fast', 'Mechanical Turk 5 Cents'],
                   list(range(1, 12)),
                   normalize=True)
In [11]:
from scipy.stats import gaussian_kde
(fraudy, legit) = ('Turk, asking for Fast', 'Mechanical Turk 5 Cents')
# Fit a KDE of answer time per question for each cohort; these act as the
# likelihood models for fraud-like and legit behavior below.
fraudy_timings = {}
legit_timings = {}
for q_id in range(1, 12):
    q_mask = answers.question_id == q_id
    fraudy_timings[q_id] = gaussian_kde(answers.duration[q_mask & (answers.type == fraudy)])
    legit_timings[q_id] = gaussian_kde(answers.duration[q_mask & (answers.type == legit)])
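Each KDE is a smoothed density over answer times, so evaluating it at a few durations gives the likelihoods used below (the actual values depend on the data):

fraudy_timings[1].evaluate([2.0, 10.0, 60.0])  # density at 2s, 10s, 60s for question 1
legit_timings[1].evaluate([2.0, 10.0, 60.0])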
In [12]:
SIZE = 5
NUM_QUESTIONS = len(legit_timings)
plt.figure(figsize=(SIZE, SIZE * NUM_QUESTIONS))
for i in sorted(fraudy_timings):
    plt.subplot(NUM_QUESTIONS, 1, i)
    plt.title(text_by_id[i][:80] + ' (question {})'.format(i))
    fraud_kde = fraudy_timings[i]
    legit_kde = legit_timings[i]
    x = np.arange(0, 120, .1)
    plt.yticks([])
    plt.xlabel('Seconds to answer')
    plt.plot(x, fraud_kde.evaluate(x), 'r', label='more fraudulent')
    plt.plot(x, legit_kde.evaluate(x), 'g', label='more legitimate')
    plt.legend()
Here's the start of digging into some of the actual answer data, just for fun.
In [13]:
import re
political_regex = re.compile('.*(obama|jfk|kennedy|ronald|reagan|regan|clinton|bill cl|'
                             'george washington|george w|dukakis|saddam|'
                             'bush|carter|nixon|modi|gorbachev|lincoln|trudeau|'
                             'brezhnev|perot|'
                             'mahatma gandhi|nehru|gingrich|martin luther king|mlk|'
                             'rajiv gandhi|ford|rajive gandhi|eisenhower|'
                             'rahul gandhi|indira gandhi|nelson mandela|white house|'
                             'gandhi|thatcher).*', re.IGNORECASE)
# Modern pandas .str.match returns booleans; .str.findall returns the captured
# group matches ([] when nothing matches), close to the old str.match behavior.
answers['figure'] = answers[answers.question_id == 3]['answer'].str.findall(political_regex)
def get_first(l):
    if isinstance(l, list) and l:
        return l[0].lower()
answers['figure_clean'] = answers['figure'].apply(get_first)
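As a quick check of what findall captures (a made-up answer, not from the data):

political_regex.findall('I remember when Obama was first elected')
# -> ['Obama']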
In [14]:
answers.figure_clean.value_counts()
Out[14]:
In [15]:
# Someone famous in India who I had no clue about:
answers.answer[answers.figure_clean == 'modi']
Out[15]:
In [16]:
# Find the folks not captured by that regex:
def no_match(x):
    return isinstance(x, list) and not x
list(answers[answers['figure'].apply(no_match) & (answers.answer != 'DEFAULT')].answer)
Out[16]:
In [17]:
def score_for_user(user_id, initial_fraud_probability=.1):
    fraud_probability = initial_fraud_probability
    nonfraud_probability = 1 - initial_fraud_probability
    the_data = answers[answers.user_id == user_id].sort_values('question_id')[[
        'question_id', 'answer', 'duration']]
    partial_results = []
    for r in the_data.iterrows():
        (question_id, answer, duration) = r[1]
        # Multiply in the likelihood of this answer time under each model,
        # then renormalize so the two probabilities sum to 1.
        fraud_likelihood = fraudy_timings[question_id].evaluate(duration)[0]
        fraud_probability *= fraud_likelihood
        nonfraud_likelihood = legit_timings[question_id].evaluate(duration)[0]
        nonfraud_probability *= nonfraud_likelihood
        normalizer = nonfraud_probability + fraud_probability
        fraud_probability /= normalizer
        nonfraud_probability /= normalizer
        partial_results.append({'question_id': question_id,
                                'duration': duration,
                                'answer': answer,
                                'fraud_p': fraud_probability,
                                'nonfraud_p': nonfraud_probability})
    return partial_results
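Each loop iteration is just Bayes' rule with the two KDEs as likelihood models. With $P_0$ the initial fraud probability (0.1 above), $f_i$ and $g_i$ the fraud and legit densities for question $i$, and $d_i$ the observed duration, the final score is

$$P(\text{fraud} \mid d_1, \ldots, d_n) = \frac{P_0 \prod_i f_i(d_i)}{P_0 \prod_i f_i(d_i) + (1 - P_0) \prod_i g_i(d_i)}.$$

Renormalizing after every question, as the loop does, gives the same number while keeping the running products from underflowing.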
In [18]:
score_for_user(87397087779)
Out[18]:
In [19]:
answers.duration[answers.duration < 10].round(0).value_counts()
Out[19]:
In [20]:
all_scores = []
for u in answers.user_id.unique():
    all_scores.append(score_for_user(u))
plt.hist([d[-1]['fraud_p'] for d in all_scores])
plt.xlabel('Final probability of fraud')
plt.ylabel('Number of people')
Out[20]:
In [21]:
# Looking at some examples.
from pprint import pprint
import random
def get_instance(collection, lower, upper, function):
    g = [c for c in collection if lower <= function(c) <= upper]
    if g:
        return random.choice(g)
get_fraud_p = lambda c: c[-1]['fraud_p']
print('Good surveys')
short_questions = {1: "First name",
                   2: "People with your name honest?",
                   3: "Earliest political memory?",
                   4: "Men or women need more exercise?",
                   5: "What country do you live in?",
                   6: "Allocating money to different departments",
                   7: "How sad would you be if various plants went away?",
                   8: "What animal would you not want to leave with a sheep?",
                   9: "10 kids, 1 evil kid, 0 kids, or 2 bad kids?",
                   10: "Do you have any idea what the word 'Telluride' means?",
                   11: "Who would your parents like?"}
def to_table(scored_answers):
    rows = ['<tr><td>{}</td><td>{}</td><td>{:0.3f}</td><td>{:0.5f}</td></tr>'
            .format(short_questions[r['question_id']], r['answer'], r['duration'], r['fraud_p'])
            for r in scored_answers]
    return ('<table>\n'
            '<tr><th>Question</th><th>Answer</th><th>Duration</th><th>Fraud probability</th></tr>\n'
            '{}\n'
            '</table>'.format('\n'.join(rows)))
print('Good table\n', to_table(get_instance(all_scores, .0, .1, get_fraud_p)))
print('Bad table\n', to_table(get_instance(all_scores, .9, 1, get_fraud_p)))
In [22]:
pdf_data = {'range': [0, 120],
            'step_size': .25}
pdf_values = {}
x = np.arange(pdf_data['range'][0], pdf_data['range'][1], pdf_data['step_size'])
for i in range(1, 12):
    fraud_kde = fraudy_timings[i]
    legit_kde = legit_timings[i]
    pdf_values[i] = {'legit': list(legit_kde.evaluate(x)),
                     'fraudy': list(fraud_kde.evaluate(x))}
pdf_data['values'] = pdf_values
# Export the sampled PDFs as JSON for use outside this notebook.
with open('fraud_model_pdf.json', 'wt') as f:
    json.dump(pdf_data, f)
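A consumer can map a duration back to the two likelihoods with a nearest-bin lookup into that grid. A minimal sketch (the helper name and the clamping at the grid edges are my own choices, not part of the app):

import json

def lookup_likelihoods(question_id, duration, path='fraud_model_pdf.json'):
    # Hypothetical consumer of the export above: nearest-bin PDF lookup.
    with open(path) as f:
        model = json.load(f)
    values = model['values'][str(question_id)]  # JSON object keys are strings
    index = int((duration - model['range'][0]) / model['step_size'])
    index = max(0, min(index, len(values['legit']) - 1))  # clamp to the grid
    return values['fraudy'][index], values['legit'][index]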
In [ ]: